We will read in our CSV dataset. As instructed, we will also split our data by data_channel_is_*.
# Load the full Online News Popularity data set from its CSV file
online_new_popularity_data <- read.csv(
  file = "./OnlineNewsPopularity/OnlineNewsPopularity.csv"
)
We will subset the data based on the category listed in our YAML. In this case we will be using data_channel_is_lifestyle. We will also remove non-predictors such as url and timedelta.
# Keep only the rows flagged for the channel named in params$category, then
# retain n_tokens_title through shares minus the channel indicator columns
# (per the accompanying text, this is what leaves out url and timedelta)
subset_data <- online_new_popularity_data %>%
  filter((!!as.name(paste0("data_channel_is_", params$category))) == 1) %>%
  select(n_tokens_title:shares, -starts_with("data_channel_is"))
Next we will check for potentially problematic values such as NA or infinity. These could result in errors in later analysis. Should a problem arise later on, this check serves as a diagnostic to rule out such values as the cause.
# Flag every column that contains at least one NA or infinite value.
# vapply() visits the data frame column by column, so each column keeps its
# own type; apply() would first coerce the whole frame to a single matrix
# (a character matrix if any column is non-numeric, where is.infinite() is
# always FALSE). It also guarantees exactly one logical result per column.
vapply(
  subset_data,
  function(column) any(is.na(column) | is.infinite(column)),
  logical(1)
)
## n_tokens_title n_tokens_content
## FALSE FALSE
## n_unique_tokens n_non_stop_words
## FALSE FALSE
## n_non_stop_unique_tokens num_hrefs
## FALSE FALSE
## num_self_hrefs num_imgs
## FALSE FALSE
## num_videos average_token_length
## FALSE FALSE
## num_keywords kw_min_min
## FALSE FALSE
## kw_max_min kw_avg_min
## FALSE FALSE
## kw_min_max kw_max_max
## FALSE FALSE
## kw_avg_max kw_min_avg
## FALSE FALSE
## kw_max_avg kw_avg_avg
## FALSE FALSE
## self_reference_min_shares self_reference_max_shares
## FALSE FALSE
## self_reference_avg_sharess weekday_is_monday
## FALSE FALSE
## weekday_is_tuesday weekday_is_wednesday
## FALSE FALSE
## weekday_is_thursday weekday_is_friday
## FALSE FALSE
## weekday_is_saturday weekday_is_sunday
## FALSE FALSE
## is_weekend LDA_00
## FALSE FALSE
## LDA_01 LDA_02
## FALSE FALSE
## LDA_03 LDA_04
## FALSE FALSE
## global_subjectivity global_sentiment_polarity
## FALSE FALSE
## global_rate_positive_words global_rate_negative_words
## FALSE FALSE
## rate_positive_words rate_negative_words
## FALSE FALSE
## avg_positive_polarity min_positive_polarity
## FALSE FALSE
## max_positive_polarity avg_negative_polarity
## FALSE FALSE
## min_negative_polarity max_negative_polarity
## FALSE FALSE
## title_subjectivity title_sentiment_polarity
## FALSE FALSE
## abs_title_subjectivity abs_title_sentiment_polarity
## FALSE FALSE
## shares
## FALSE
# Setting up a simple 70/30 split for our already subset data
# Fix the RNG seed so the same rows land in each partition on every run
set.seed(123)
sample_size <- floor(0.7 * nrow(subset_data))
train_ind <- sample(seq_len(nrow(subset_data)), size = sample_size)
# These partitions are needed later on when we start modeling.
# The test rows are selected by positive index: with an empty train_ind,
# subset_data[-train_ind, ] would return zero rows instead of every row.
training_data <- subset_data[train_ind, ]
test_data <- subset_data[setdiff(seq_len(nrow(subset_data)), train_ind), ]
First, let’s compute a simple numeric summary, a six-number summary, for each variable in the training data set. This summary includes the minimum, 1st quartile, median, mean, 3rd quartile, and maximum values. This provides a sense of scale and range for the variable values.
# Per-column six-number summary (min, quartiles, median, mean, max)
summary(training_data)
## n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words
## Min. : 3.00 Min. : 0.0 Min. :0.0000 Min. :0.0000
## 1st Qu.: 8.00 1st Qu.: 316.0 1st Qu.:0.4595 1st Qu.:1.0000
## Median :10.00 Median : 509.0 Median :0.5178 Median :1.0000
## Mean : 9.73 Mean : 631.9 Mean :0.5198 Mean :0.9884
## 3rd Qu.:11.00 3rd Qu.: 807.0 3rd Qu.:0.5891 3rd Qu.:1.0000
## Max. :17.00 Max. :8474.0 Max. :0.8382 Max. :1.0000
## n_non_stop_unique_tokens num_hrefs num_self_hrefs num_imgs
## Min. :0.0000 Min. : 0.0 Min. : 0.000 Min. : 0.000
## 1st Qu.:0.6222 1st Qu.: 6.0 1st Qu.: 1.000 1st Qu.: 1.000
## Median :0.6814 Median : 10.0 Median : 2.000 Median : 1.000
## Mean :0.6793 Mean : 13.6 Mean : 2.554 Mean : 4.941
## 3rd Qu.:0.7500 3rd Qu.: 18.0 3rd Qu.: 3.000 3rd Qu.: 8.000
## Max. :1.0000 Max. :118.0 Max. :27.000 Max. :111.000
## num_videos average_token_length num_keywords kw_min_min
## Min. : 0.000 Min. :0.000 Min. : 3.00 Min. : -1.00
## 1st Qu.: 0.000 1st Qu.:4.440 1st Qu.: 7.00 1st Qu.: -1.00
## Median : 0.000 Median :4.621 Median : 8.00 Median : 4.00
## Mean : 0.501 Mean :4.579 Mean : 8.26 Mean : 41.41
## 3rd Qu.: 0.000 3rd Qu.:4.795 3rd Qu.:10.00 3rd Qu.: 4.00
## Max. :50.000 Max. :5.947 Max. :10.00 Max. :377.00
## kw_max_min kw_avg_min kw_min_max kw_max_max
## Min. : 0 Min. : -1.0 Min. : 0 Min. : 0
## 1st Qu.: 503 1st Qu.: 186.6 1st Qu.: 0 1st Qu.:690400
## Median : 821 Median : 303.8 Median : 0 Median :843300
## Mean : 1639 Mean : 414.2 Mean : 7233 Mean :700117
## 3rd Qu.: 1300 3rd Qu.: 445.4 3rd Qu.: 6300 3rd Qu.:843300
## Max. :98700 Max. :14187.8 Max. :208300 Max. :843300
## kw_avg_max kw_min_avg kw_max_avg kw_avg_avg
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.:118050 1st Qu.: 0 1st Qu.: 4071 1st Qu.: 2627
## Median :181830 Median : 0 Median : 5015 Median : 3231
## Mean :183002 Mean :1051 Mean : 6610 Mean : 3402
## 3rd Qu.:248738 3rd Qu.:2226 3rd Qu.: 7200 3rd Qu.: 3923
## Max. :538744 Max. :3610 Max. :98700 Max. :20378
## self_reference_min_shares self_reference_max_shares self_reference_avg_sharess
## Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 555 1st Qu.: 883 1st Qu.: 875.3
## Median : 1600 Median : 2700 Median : 2446.5
## Mean : 4435 Mean : 8262 Mean : 6026.5
## 3rd Qu.: 3500 3rd Qu.: 6700 3rd Qu.: 5200.0
## Max. :144900 Max. :690400 Max. :401450.0
## weekday_is_monday weekday_is_tuesday weekday_is_wednesday weekday_is_thursday
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1538 Mean :0.1586 Mean :0.1811 Mean :0.1715
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## weekday_is_friday weekday_is_saturday weekday_is_sunday is_weekend
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.1477 Mean :0.08305 Mean :0.1042 Mean :0.1872
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## LDA_00 LDA_01 LDA_02 LDA_03
## Min. :0.01818 Min. :0.01819 Min. :0.01819 Min. :0.01820
## 1st Qu.:0.02250 1st Qu.:0.02223 1st Qu.:0.02222 1st Qu.:0.02255
## Median :0.02898 Median :0.02506 Median :0.02518 Median :0.02918
## Mean :0.17444 Mean :0.06642 Mean :0.07639 Mean :0.14917
## 3rd Qu.:0.23055 3rd Qu.:0.04001 3rd Qu.:0.05040 3rd Qu.:0.22134
## Max. :0.91980 Max. :0.68825 Max. :0.67623 Max. :0.91892
## LDA_04 global_subjectivity global_sentiment_polarity
## Min. :0.02014 Min. :0.0000 Min. :-0.3727
## 1st Qu.:0.31672 1st Qu.:0.4236 1st Qu.: 0.1001
## Median :0.57103 Median :0.4764 Median : 0.1491
## Mean :0.53358 Mean :0.4720 Mean : 0.1512
## 3rd Qu.:0.79982 3rd Qu.:0.5253 3rd Qu.: 0.2024
## Max. :0.92707 Max. :0.8667 Max. : 0.5800
## global_rate_positive_words global_rate_negative_words rate_positive_words
## Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.03465 1st Qu.:0.01046 1st Qu.:0.6610
## Median :0.04370 Median :0.01552 Median :0.7368
## Mean :0.04443 Mean :0.01641 Mean :0.7208
## 3rd Qu.:0.05333 3rd Qu.:0.02115 3rd Qu.:0.8095
## Max. :0.12139 Max. :0.05785 Max. :1.0000
## rate_negative_words avg_positive_polarity min_positive_polarity
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1864 1st Qu.:0.3367 1st Qu.:0.05000
## Median :0.2609 Median :0.3857 Median :0.10000
## Mean :0.2676 Mean :0.3833 Mean :0.09452
## 3rd Qu.:0.3333 3rd Qu.:0.4338 3rd Qu.:0.10000
## Max. :1.0000 Max. :0.7553 Max. :0.50000
## max_positive_polarity avg_negative_polarity min_negative_polarity
## Min. :0.000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.:0.700 1st Qu.:-0.3213 1st Qu.:-0.7143
## Median :1.000 Median :-0.2585 Median :-0.5000
## Mean :0.831 Mean :-0.2660 Mean :-0.5621
## 3rd Qu.:1.000 3rd Qu.:-0.2024 3rd Qu.:-0.4000
## Max. :1.000 Max. : 0.0000 Max. : 0.0000
## max_negative_polarity title_subjectivity title_sentiment_polarity
## Min. :-1.0000 Min. :0.0000 Min. :-1.0000
## 1st Qu.:-0.1250 1st Qu.:0.0000 1st Qu.: 0.0000
## Median :-0.1000 Median :0.1000 Median : 0.0000
## Mean :-0.1032 Mean :0.2855 Mean : 0.1069
## 3rd Qu.:-0.0500 3rd Qu.:0.5000 3rd Qu.: 0.2143
## Max. : 0.0000 Max. :1.0000 Max. : 1.0000
## abs_title_subjectivity abs_title_sentiment_polarity shares
## Min. :0.0000 Min. :0.0000 Min. : 28
## 1st Qu.:0.2000 1st Qu.:0.0000 1st Qu.: 1100
## Median :0.5000 Median :0.0000 Median : 1700
## Mean :0.3531 Mean :0.1733 Mean : 3870
## 3rd Qu.:0.5000 3rd Qu.:0.3000 3rd Qu.: 3300
## Max. :0.5000 Max. :1.0000 Max. :208300
The previous section does not generate standard deviation for the variable values. Standard deviation is necessary for determining the variance of the response and predictors. It is a good diagnostic to spot potential issues that violate assumptions necessary for models and analysis.
# Penalize scientific notation heavily so the SDs print in fixed notation
options(scipen = 999)
# Standard deviation of every training column. vapply() pins the result to
# exactly one numeric value per column, whereas sapply() silently changes its
# return type on empty or non-scalar results.
train_SDs <- vapply(training_data, sd, numeric(1))
round(train_SDs, digits = 5)
## n_tokens_title n_tokens_content
## 1.88773 571.00621
## n_unique_tokens n_non_stop_words
## 0.10748 0.10699
## n_non_stop_unique_tokens num_hrefs
## 0.11717 11.37468
## num_self_hrefs num_imgs
## 2.84056 8.31547
## num_videos average_token_length
## 2.11025 0.56286
## num_keywords kw_min_min
## 1.62840 84.38354
## kw_max_min kw_avg_min
## 4955.48435 721.51552
## kw_min_max kw_max_max
## 17612.99545 263923.17579
## kw_avg_max kw_min_avg
## 96920.90061 1245.79140
## kw_max_avg kw_avg_avg
## 7369.40053 1387.40867
## self_reference_min_shares self_reference_max_shares
## 10907.31508 29240.28231
## self_reference_avg_sharess weekday_is_monday
## 16544.01003 0.36092
## weekday_is_tuesday weekday_is_wednesday
## 0.36544 0.38521
## weekday_is_thursday weekday_is_friday
## 0.37711 0.35494
## weekday_is_saturday weekday_is_sunday
## 0.27605 0.30556
## is_weekend LDA_00
## 0.39021 0.25000
## LDA_01 LDA_02
## 0.09507 0.10644
## LDA_03 LDA_04
## 0.20297 0.29391
## global_subjectivity global_sentiment_polarity
## 0.09634 0.08718
## global_rate_positive_words global_rate_negative_words
## 0.01525 0.00862
## rate_positive_words rate_negative_words
## 0.14470 0.12526
## avg_positive_polarity min_positive_polarity
## 0.08536 0.06489
## max_positive_polarity avg_negative_polarity
## 0.20946 0.10873
## min_negative_polarity max_negative_polarity
## 0.26938 0.08736
## title_subjectivity title_sentiment_polarity
## 0.33640 0.29121
## abs_title_subjectivity abs_title_sentiment_polarity
## 0.18564 0.25722
## shares
## 10135.85662
# Set scipen back to 0 so later output may use scientific notation again
options(scipen = 0)
Although the 1st and 3rd quartiles are identified in the six-number summary, it is helpful to quantify the range between these two values, the IQR. The IQR is also needed for subsequent plotting. Binary predictor variables such as weekday_is_* and is_weekend have IQR values of 0 given the nature of those predictors.
# Interquartile range of every training column, gathered into a one-row tibble
IQRs <- as_tibble(lapply(training_data, IQR))
# Print the computed table; a bare `IQR` would print the stats::IQR function
IQRs
## function (x, na.rm = FALSE, type = 7)
## diff(quantile(as.numeric(x), c(0.25, 0.75), na.rm = na.rm, names = FALSE,
## type = type))
## <bytecode: 0x7fbb2af40ce0>
## <environment: namespace:stats>
Prior to performing any model fitting or statistical analysis, it is essential to understand the potential correlation among predictors and between the response and predictors. Correlation helps identify potential collinearity and thus allows for better candidate model selection. It is worth noting any absolute correlation values > 0.5. However, this threshold has been left to the discretion of the individual. The correlation matrix has been further subset into shares vs predictor correlations and filtered at a threshold correlation value of 0.025.
# One-column tibble (column `variable`) listing the training data column names
variables <- rename(as_tibble(names(training_data)), variable = "value")
# Pairwise correlations between all training columns (cor() default method),
# printed rounded to three decimal places
corr <- cor(training_data)
round(corr, digits = 3)
## n_tokens_title n_tokens_content n_unique_tokens
## n_tokens_title 1.000 -0.004 -0.042
## n_tokens_content -0.004 1.000 -0.539
## n_unique_tokens -0.042 -0.539 1.000
## n_non_stop_words -0.049 0.120 0.524
## n_non_stop_unique_tokens -0.031 -0.369 0.905
## num_hrefs -0.095 0.330 -0.219
## num_self_hrefs 0.021 0.201 -0.123
## num_imgs -0.008 0.505 -0.267
## num_videos 0.009 0.041 0.004
## average_token_length -0.087 0.027 0.560
## num_keywords -0.070 0.097 -0.136
## kw_min_min -0.043 -0.114 0.098
## kw_max_min 0.040 0.017 -0.049
## kw_avg_min 0.016 -0.002 -0.018
## kw_min_max 0.009 0.055 -0.049
## kw_max_max 0.055 0.113 -0.098
## kw_avg_max 0.143 0.046 -0.095
## kw_min_avg -0.045 0.053 -0.003
## kw_max_avg 0.009 0.035 -0.052
## kw_avg_avg 0.023 0.067 -0.086
## self_reference_min_shares 0.061 0.004 0.026
## self_reference_max_shares 0.025 0.047 -0.016
## self_reference_avg_sharess 0.044 0.033 0.001
## weekday_is_monday 0.010 -0.036 0.016
## weekday_is_tuesday 0.037 -0.010 0.026
## weekday_is_wednesday 0.004 0.026 0.004
## weekday_is_thursday -0.001 -0.036 0.012
## weekday_is_friday -0.046 0.023 -0.007
## weekday_is_saturday 0.004 0.013 -0.013
## weekday_is_sunday -0.010 0.027 -0.050
## is_weekend -0.005 0.031 -0.048
## LDA_00 0.009 0.070 -0.055
## LDA_01 0.054 -0.025 0.036
## LDA_02 0.039 0.015 -0.024
## LDA_03 0.010 0.040 -0.061
## LDA_04 -0.046 -0.085 0.086
## global_subjectivity -0.083 0.097 0.225
## global_sentiment_polarity -0.119 0.068 0.035
## global_rate_positive_words -0.067 0.140 0.067
## global_rate_negative_words 0.030 0.056 0.097
## rate_positive_words -0.087 0.091 0.247
## rate_negative_words 0.059 -0.002 0.162
## avg_positive_polarity -0.108 0.086 0.232
## min_positive_polarity -0.038 -0.273 0.392
## max_positive_polarity -0.046 0.347 -0.098
## avg_negative_polarity -0.028 -0.098 -0.114
## min_negative_polarity -0.048 -0.384 0.194
## max_negative_polarity 0.005 0.244 -0.333
## title_subjectivity 0.017 -0.005 -0.056
## title_sentiment_polarity -0.019 0.008 -0.052
## abs_title_subjectivity -0.103 -0.025 0.023
## abs_title_sentiment_polarity 0.005 -0.008 -0.062
## shares 0.008 0.097 -0.041
## n_non_stop_words n_non_stop_unique_tokens
## n_tokens_title -0.049 -0.031
## n_tokens_content 0.120 -0.369
## n_unique_tokens 0.524 0.905
## n_non_stop_words 1.000 0.628
## n_non_stop_unique_tokens 0.628 1.000
## num_hrefs 0.129 -0.261
## num_self_hrefs 0.097 -0.100
## num_imgs -0.011 -0.379
## num_videos -0.004 0.002
## average_token_length 0.881 0.565
## num_keywords -0.026 -0.163
## kw_min_min 0.005 0.107
## kw_max_min -0.046 -0.049
## kw_avg_min -0.026 -0.018
## kw_min_max -0.015 -0.068
## kw_max_max -0.002 -0.099
## kw_avg_max -0.081 -0.145
## kw_min_avg 0.000 -0.069
## kw_max_avg -0.029 -0.087
## kw_avg_avg -0.056 -0.157
## self_reference_min_shares 0.044 0.018
## self_reference_max_shares 0.031 0.002
## self_reference_avg_sharess 0.039 0.012
## weekday_is_monday -0.024 0.009
## weekday_is_tuesday 0.012 0.047
## weekday_is_wednesday 0.018 0.005
## weekday_is_thursday -0.035 0.007
## weekday_is_friday 0.027 0.019
## weekday_is_saturday 0.009 -0.034
## weekday_is_sunday -0.005 -0.073
## is_weekend 0.003 -0.082
## LDA_00 0.004 0.000
## LDA_01 0.029 0.040
## LDA_02 0.003 0.000
## LDA_03 -0.110 -0.189
## LDA_04 0.063 0.118
## global_subjectivity 0.530 0.243
## global_sentiment_polarity 0.188 0.015
## global_rate_positive_words 0.315 0.130
## global_rate_negative_words 0.206 0.167
## rate_positive_words 0.539 0.289
## rate_negative_words 0.231 0.202
## avg_positive_polarity 0.486 0.253
## min_positive_polarity 0.158 0.287
## max_positive_polarity 0.429 0.021
## avg_negative_polarity -0.265 -0.159
## min_negative_polarity -0.226 0.040
## max_negative_polarity -0.128 -0.238
## title_subjectivity -0.086 -0.089
## title_sentiment_polarity -0.041 -0.091
## abs_title_subjectivity -0.004 0.025
## abs_title_sentiment_polarity -0.106 -0.098
## shares -0.024 -0.017
## num_hrefs num_self_hrefs num_imgs num_videos
## n_tokens_title -0.095 0.021 -0.008 0.009
## n_tokens_content 0.330 0.201 0.505 0.041
## n_unique_tokens -0.219 -0.123 -0.267 0.004
## n_non_stop_words 0.129 0.097 -0.011 -0.004
## n_non_stop_unique_tokens -0.261 -0.100 -0.379 0.002
## num_hrefs 1.000 0.264 0.447 0.041
## num_self_hrefs 0.264 1.000 0.198 0.011
## num_imgs 0.447 0.198 1.000 -0.055
## num_videos 0.041 0.011 -0.055 1.000
## average_token_length 0.158 0.053 -0.007 -0.008
## num_keywords 0.226 0.194 0.181 0.004
## kw_min_min -0.148 0.097 -0.097 -0.059
## kw_max_min 0.017 0.031 -0.008 0.060
## kw_avg_min -0.015 0.045 -0.034 0.046
## kw_min_max 0.105 -0.033 0.109 0.063
## kw_max_max 0.157 -0.121 0.097 0.052
## kw_avg_max 0.131 -0.146 0.137 0.122
## kw_min_avg 0.150 -0.042 0.192 0.099
## kw_max_avg 0.124 -0.008 0.113 0.040
## kw_avg_avg 0.220 -0.077 0.252 0.088
## self_reference_min_shares -0.013 -0.022 0.039 0.037
## self_reference_max_shares 0.031 0.105 0.023 0.069
## self_reference_avg_sharess 0.012 0.046 0.028 0.074
## weekday_is_monday -0.042 0.008 -0.049 0.051
## weekday_is_tuesday -0.045 -0.006 -0.079 -0.030
## weekday_is_wednesday -0.013 0.002 -0.003 -0.012
## weekday_is_thursday -0.078 -0.027 -0.058 -0.003
## weekday_is_friday 0.012 -0.006 -0.038 -0.022
## weekday_is_saturday 0.098 0.026 0.132 0.014
## weekday_is_sunday 0.114 0.012 0.154 0.007
## is_weekend 0.158 0.028 0.214 0.015
## LDA_00 0.051 -0.005 -0.062 0.010
## LDA_01 -0.058 0.090 -0.045 0.012
## LDA_02 -0.138 -0.111 -0.117 0.015
## LDA_03 0.281 -0.037 0.427 0.108
## LDA_04 -0.169 0.041 -0.185 -0.092
## global_subjectivity 0.272 0.040 0.212 0.016
## global_sentiment_polarity 0.206 0.084 0.175 0.004
## global_rate_positive_words 0.142 0.088 0.073 -0.016
## global_rate_negative_words -0.030 -0.061 -0.030 0.003
## rate_positive_words 0.145 0.123 0.034 -0.002
## rate_negative_words -0.057 -0.059 -0.048 -0.002
## avg_positive_polarity 0.246 0.026 0.194 0.027
## min_positive_polarity -0.092 -0.084 -0.016 0.029
## max_positive_polarity 0.313 0.109 0.184 0.041
## avg_negative_polarity -0.113 -0.030 -0.098 -0.016
## min_negative_polarity -0.197 -0.034 -0.123 -0.049
## max_negative_polarity 0.049 0.029 0.023 0.024
## title_subjectivity 0.063 -0.010 0.119 -0.004
## title_sentiment_polarity 0.052 -0.030 0.098 0.001
## abs_title_subjectivity 0.005 -0.028 -0.022 -0.026
## abs_title_sentiment_polarity 0.052 -0.022 0.091 0.022
## shares 0.058 -0.015 0.044 0.085
## average_token_length num_keywords kw_min_min
## n_tokens_title -0.087 -0.070 -0.043
## n_tokens_content 0.027 0.097 -0.114
## n_unique_tokens 0.560 -0.136 0.098
## n_non_stop_words 0.881 -0.026 0.005
## n_non_stop_unique_tokens 0.565 -0.163 0.107
## num_hrefs 0.158 0.226 -0.148
## num_self_hrefs 0.053 0.194 0.097
## num_imgs -0.007 0.181 -0.097
## num_videos -0.008 0.004 -0.059
## average_token_length 1.000 -0.038 -0.007
## num_keywords -0.038 1.000 0.023
## kw_min_min -0.007 0.023 1.000
## kw_max_min -0.032 0.059 -0.026
## kw_avg_min -0.015 0.046 0.076
## kw_min_max -0.019 -0.228 -0.150
## kw_max_max 0.009 -0.029 -0.865
## kw_avg_max -0.046 -0.245 -0.672
## kw_min_avg 0.002 -0.102 -0.211
## kw_max_avg -0.002 0.085 -0.159
## kw_avg_avg -0.025 0.028 -0.358
## self_reference_min_shares 0.038 -0.051 -0.092
## self_reference_max_shares 0.027 -0.013 -0.068
## self_reference_avg_sharess 0.032 -0.039 -0.089
## weekday_is_monday -0.033 -0.047 -0.035
## weekday_is_tuesday 0.019 -0.058 0.003
## weekday_is_wednesday 0.030 0.032 0.044
## weekday_is_thursday -0.019 -0.038 0.005
## weekday_is_friday 0.024 0.005 0.077
## weekday_is_saturday 0.000 0.044 -0.077
## weekday_is_sunday -0.026 0.085 -0.045
## is_weekend -0.020 0.098 -0.090
## LDA_00 -0.013 -0.032 -0.145
## LDA_01 0.045 -0.053 0.034
## LDA_02 0.054 -0.105 -0.070
## LDA_03 -0.081 0.147 -0.146
## LDA_04 0.033 -0.019 0.239
## global_subjectivity 0.425 0.070 -0.052
## global_sentiment_polarity 0.140 0.123 0.020
## global_rate_positive_words 0.238 0.068 -0.007
## global_rate_negative_words 0.126 -0.004 -0.050
## rate_positive_words 0.494 0.022 0.042
## rate_negative_words 0.182 -0.047 -0.044
## avg_positive_polarity 0.370 0.115 -0.053
## min_positive_polarity 0.151 -0.049 0.022
## max_positive_polarity 0.297 0.135 -0.132
## avg_negative_polarity -0.196 0.019 0.042
## min_negative_polarity -0.112 0.014 0.100
## max_negative_polarity -0.157 0.036 -0.031
## title_subjectivity -0.089 0.004 -0.018
## title_sentiment_polarity -0.036 0.071 -0.032
## abs_title_subjectivity 0.030 0.012 -0.035
## abs_title_sentiment_polarity -0.117 0.004 -0.056
## shares -0.020 0.005 -0.040
## kw_max_min kw_avg_min kw_min_max kw_max_max
## n_tokens_title 0.040 0.016 0.009 0.055
## n_tokens_content 0.017 -0.002 0.055 0.113
## n_unique_tokens -0.049 -0.018 -0.049 -0.098
## n_non_stop_words -0.046 -0.026 -0.015 -0.002
## n_non_stop_unique_tokens -0.049 -0.018 -0.068 -0.099
## num_hrefs 0.017 -0.015 0.105 0.157
## num_self_hrefs 0.031 0.045 -0.033 -0.121
## num_imgs -0.008 -0.034 0.109 0.097
## num_videos 0.060 0.046 0.063 0.052
## average_token_length -0.032 -0.015 -0.019 0.009
## num_keywords 0.059 0.046 -0.228 -0.029
## kw_min_min -0.026 0.076 -0.150 -0.865
## kw_max_min 1.000 0.964 0.010 0.027
## kw_avg_min 0.964 1.000 -0.026 -0.077
## kw_min_max 0.010 -0.026 1.000 0.169
## kw_max_max 0.027 -0.077 0.169 1.000
## kw_avg_max -0.006 -0.128 0.383 0.751
## kw_min_avg 0.025 -0.025 0.609 0.255
## kw_max_avg 0.644 0.610 0.128 0.174
## kw_avg_avg 0.462 0.401 0.368 0.411
## self_reference_min_shares 0.159 0.156 0.083 0.100
## self_reference_max_shares 0.064 0.060 0.025 0.060
## self_reference_avg_sharess 0.110 0.107 0.052 0.086
## weekday_is_monday 0.032 0.012 -0.037 0.009
## weekday_is_tuesday 0.004 0.015 -0.012 -0.002
## weekday_is_wednesday -0.012 -0.008 -0.026 -0.053
## weekday_is_thursday 0.014 0.026 -0.029 -0.012
## weekday_is_friday -0.017 0.000 -0.016 -0.066
## weekday_is_saturday -0.016 -0.031 0.076 0.078
## weekday_is_sunday -0.010 -0.025 0.077 0.080
## is_weekend -0.019 -0.042 0.114 0.118
## LDA_00 0.037 0.031 -0.019 0.155
## LDA_01 -0.003 0.013 -0.086 -0.033
## LDA_02 -0.004 -0.003 -0.067 0.047
## LDA_03 0.044 0.016 0.197 0.146
## LDA_04 -0.060 -0.040 -0.068 -0.239
## global_subjectivity -0.054 -0.064 0.040 0.053
## global_sentiment_polarity -0.025 -0.024 -0.016 -0.022
## global_rate_positive_words -0.011 -0.008 0.006 -0.011
## global_rate_negative_words -0.026 -0.032 0.035 0.045
## rate_positive_words -0.010 0.009 -0.037 -0.040
## rate_negative_words -0.028 -0.033 0.030 0.044
## avg_positive_polarity -0.041 -0.047 0.000 0.051
## min_positive_polarity -0.022 -0.014 0.006 -0.045
## max_positive_polarity 0.000 -0.018 0.039 0.136
## avg_negative_polarity 0.012 0.021 -0.015 -0.048
## min_negative_polarity 0.006 0.024 -0.049 -0.111
## max_negative_polarity -0.001 -0.009 0.027 0.024
## title_subjectivity 0.013 0.003 0.031 0.009
## title_sentiment_polarity 0.068 0.057 0.032 0.016
## abs_title_subjectivity -0.008 -0.014 0.003 0.035
## abs_title_sentiment_polarity 0.062 0.043 0.046 0.042
## shares 0.013 0.009 0.018 0.043
## kw_avg_max kw_min_avg kw_max_avg kw_avg_avg
## n_tokens_title 0.143 -0.045 0.009 0.023
## n_tokens_content 0.046 0.053 0.035 0.067
## n_unique_tokens -0.095 -0.003 -0.052 -0.086
## n_non_stop_words -0.081 0.000 -0.029 -0.056
## n_non_stop_unique_tokens -0.145 -0.069 -0.087 -0.157
## num_hrefs 0.131 0.150 0.124 0.220
## num_self_hrefs -0.146 -0.042 -0.008 -0.077
## num_imgs 0.137 0.192 0.113 0.252
## num_videos 0.122 0.099 0.040 0.088
## average_token_length -0.046 0.002 -0.002 -0.025
## num_keywords -0.245 -0.102 0.085 0.028
## kw_min_min -0.672 -0.211 -0.159 -0.358
## kw_max_min -0.006 0.025 0.644 0.462
## kw_avg_min -0.128 -0.025 0.610 0.401
## kw_min_max 0.383 0.609 0.128 0.368
## kw_max_max 0.751 0.255 0.174 0.411
## kw_avg_max 1.000 0.410 0.193 0.509
## kw_min_avg 0.410 1.000 0.116 0.484
## kw_max_avg 0.193 0.116 1.000 0.820
## kw_avg_avg 0.509 0.484 0.820 1.000
## self_reference_min_shares 0.106 0.064 0.197 0.244
## self_reference_max_shares 0.037 0.021 0.062 0.079
## self_reference_avg_sharess 0.072 0.041 0.123 0.154
## weekday_is_monday 0.010 -0.016 -0.013 -0.035
## weekday_is_tuesday -0.016 -0.072 -0.024 -0.046
## weekday_is_wednesday -0.030 -0.009 0.024 -0.001
## weekday_is_thursday -0.009 -0.025 0.007 -0.011
## weekday_is_friday -0.090 -0.059 -0.047 -0.078
## weekday_is_saturday 0.114 0.079 0.041 0.105
## weekday_is_sunday 0.058 0.145 0.022 0.107
## is_weekend 0.126 0.169 0.046 0.158
## LDA_00 0.088 -0.015 0.042 0.051
## LDA_01 -0.076 -0.114 -0.018 -0.075
## LDA_02 0.085 -0.063 -0.056 -0.094
## LDA_03 0.308 0.269 0.182 0.408
## LDA_04 -0.294 -0.113 -0.135 -0.267
## global_subjectivity 0.028 0.109 0.024 0.092
## global_sentiment_polarity -0.090 0.021 0.002 0.009
## global_rate_positive_words -0.041 -0.004 0.032 0.039
## global_rate_negative_words 0.092 0.050 0.012 0.063
## rate_positive_words -0.139 -0.050 -0.016 -0.068
## rate_negative_words 0.091 0.057 -0.006 0.031
## avg_positive_polarity -0.004 0.088 0.020 0.068
## min_positive_polarity -0.011 0.069 -0.005 0.013
## max_positive_polarity 0.056 0.077 0.074 0.121
## avg_negative_polarity -0.093 -0.057 -0.037 -0.090
## min_negative_polarity -0.135 -0.051 -0.040 -0.102
## max_negative_polarity -0.025 -0.031 -0.017 -0.026
## title_subjectivity 0.076 0.034 0.048 0.090
## title_sentiment_polarity 0.013 0.025 0.055 0.080
## abs_title_subjectivity -0.007 0.024 0.012 0.011
## abs_title_sentiment_polarity 0.103 0.050 0.065 0.113
## shares 0.036 0.021 0.044 0.089
## self_reference_min_shares
## n_tokens_title 0.061
## n_tokens_content 0.004
## n_unique_tokens 0.026
## n_non_stop_words 0.044
## n_non_stop_unique_tokens 0.018
## num_hrefs -0.013
## num_self_hrefs -0.022
## num_imgs 0.039
## num_videos 0.037
## average_token_length 0.038
## num_keywords -0.051
## kw_min_min -0.092
## kw_max_min 0.159
## kw_avg_min 0.156
## kw_min_max 0.083
## kw_max_max 0.100
## kw_avg_max 0.106
## kw_min_avg 0.064
## kw_max_avg 0.197
## kw_avg_avg 0.244
## self_reference_min_shares 1.000
## self_reference_max_shares 0.488
## self_reference_avg_sharess 0.766
## weekday_is_monday 0.006
## weekday_is_tuesday -0.008
## weekday_is_wednesday 0.017
## weekday_is_thursday -0.012
## weekday_is_friday -0.008
## weekday_is_saturday -0.015
## weekday_is_sunday 0.018
## is_weekend 0.004
## LDA_00 -0.014
## LDA_01 -0.005
## LDA_02 0.013
## LDA_03 0.073
## LDA_04 -0.042
## global_subjectivity 0.067
## global_sentiment_polarity -0.049
## global_rate_positive_words -0.038
## global_rate_negative_words 0.072
## rate_positive_words -0.045
## rate_negative_words 0.090
## avg_positive_polarity 0.033
## min_positive_polarity 0.025
## max_positive_polarity 0.043
## avg_negative_polarity -0.071
## min_negative_polarity -0.081
## max_negative_polarity -0.006
## title_subjectivity 0.032
## title_sentiment_polarity -0.025
## abs_title_subjectivity -0.030
## abs_title_sentiment_polarity 0.027
## shares 0.067
## self_reference_max_shares
## n_tokens_title 0.025
## n_tokens_content 0.047
## n_unique_tokens -0.016
## n_non_stop_words 0.031
## n_non_stop_unique_tokens 0.002
## num_hrefs 0.031
## num_self_hrefs 0.105
## num_imgs 0.023
## num_videos 0.069
## average_token_length 0.027
## num_keywords -0.013
## kw_min_min -0.068
## kw_max_min 0.064
## kw_avg_min 0.060
## kw_min_max 0.025
## kw_max_max 0.060
## kw_avg_max 0.037
## kw_min_avg 0.021
## kw_max_avg 0.062
## kw_avg_avg 0.079
## self_reference_min_shares 0.488
## self_reference_max_shares 1.000
## self_reference_avg_sharess 0.923
## weekday_is_monday -0.008
## weekday_is_tuesday -0.018
## weekday_is_wednesday 0.037
## weekday_is_thursday -0.022
## weekday_is_friday -0.016
## weekday_is_saturday -0.007
## weekday_is_sunday 0.037
## is_weekend 0.024
## LDA_00 0.035
## LDA_01 0.003
## LDA_02 -0.024
## LDA_03 0.019
## LDA_04 -0.035
## global_subjectivity 0.061
## global_sentiment_polarity -0.019
## global_rate_positive_words 0.008
## global_rate_negative_words 0.047
## rate_positive_words -0.015
## rate_negative_words 0.043
## avg_positive_polarity 0.033
## min_positive_polarity -0.024
## max_positive_polarity 0.067
## avg_negative_polarity -0.066
## min_negative_polarity -0.063
## max_negative_polarity 0.025
## title_subjectivity 0.036
## title_sentiment_polarity -0.087
## abs_title_subjectivity -0.037
## abs_title_sentiment_polarity 0.032
## shares 0.016
## self_reference_avg_sharess weekday_is_monday
## n_tokens_title 0.044 0.010
## n_tokens_content 0.033 -0.036
## n_unique_tokens 0.001 0.016
## n_non_stop_words 0.039 -0.024
## n_non_stop_unique_tokens 0.012 0.009
## num_hrefs 0.012 -0.042
## num_self_hrefs 0.046 0.008
## num_imgs 0.028 -0.049
## num_videos 0.074 0.051
## average_token_length 0.032 -0.033
## num_keywords -0.039 -0.047
## kw_min_min -0.089 -0.035
## kw_max_min 0.110 0.032
## kw_avg_min 0.107 0.012
## kw_min_max 0.052 -0.037
## kw_max_max 0.086 0.009
## kw_avg_max 0.072 0.010
## kw_min_avg 0.041 -0.016
## kw_max_avg 0.123 -0.013
## kw_avg_avg 0.154 -0.035
## self_reference_min_shares 0.766 0.006
## self_reference_max_shares 0.923 -0.008
## self_reference_avg_sharess 1.000 -0.004
## weekday_is_monday -0.004 1.000
## weekday_is_tuesday -0.016 -0.185
## weekday_is_wednesday 0.037 -0.201
## weekday_is_thursday -0.018 -0.194
## weekday_is_friday -0.012 -0.178
## weekday_is_saturday -0.013 -0.128
## weekday_is_sunday 0.026 -0.145
## is_weekend 0.011 -0.205
## LDA_00 0.019 0.035
## LDA_01 0.001 -0.016
## LDA_02 -0.012 -0.038
## LDA_03 0.044 -0.094
## LDA_04 -0.043 0.054
## global_subjectivity 0.073 0.027
## global_sentiment_polarity -0.036 0.054
## global_rate_positive_words -0.013 -0.002
## global_rate_negative_words 0.065 -0.049
## rate_positive_words -0.032 0.006
## rate_negative_words 0.070 -0.028
## avg_positive_polarity 0.037 0.000
## min_positive_polarity -0.005 -0.008
## max_positive_polarity 0.062 -0.002
## avg_negative_polarity -0.077 -0.008
## min_negative_polarity -0.076 0.009
## max_negative_polarity 0.014 -0.009
## title_subjectivity 0.039 0.004
## title_sentiment_polarity -0.072 0.004
## abs_title_subjectivity -0.044 -0.031
## abs_title_sentiment_polarity 0.033 0.000
## shares 0.036 0.039
## weekday_is_tuesday weekday_is_wednesday
## n_tokens_title 0.037 0.004
## n_tokens_content -0.010 0.026
## n_unique_tokens 0.026 0.004
## n_non_stop_words 0.012 0.018
## n_non_stop_unique_tokens 0.047 0.005
## num_hrefs -0.045 -0.013
## num_self_hrefs -0.006 0.002
## num_imgs -0.079 -0.003
## num_videos -0.030 -0.012
## average_token_length 0.019 0.030
## num_keywords -0.058 0.032
## kw_min_min 0.003 0.044
## kw_max_min 0.004 -0.012
## kw_avg_min 0.015 -0.008
## kw_min_max -0.012 -0.026
## kw_max_max -0.002 -0.053
## kw_avg_max -0.016 -0.030
## kw_min_avg -0.072 -0.009
## kw_max_avg -0.024 0.024
## kw_avg_avg -0.046 -0.001
## self_reference_min_shares -0.008 0.017
## self_reference_max_shares -0.018 0.037
## self_reference_avg_sharess -0.016 0.037
## weekday_is_monday -0.185 -0.201
## weekday_is_tuesday 1.000 -0.204
## weekday_is_wednesday -0.204 1.000
## weekday_is_thursday -0.198 -0.214
## weekday_is_friday -0.181 -0.196
## weekday_is_saturday -0.131 -0.142
## weekday_is_sunday -0.148 -0.160
## is_weekend -0.208 -0.226
## LDA_00 0.055 0.002
## LDA_01 0.018 -0.008
## LDA_02 0.004 0.054
## LDA_03 -0.064 -0.042
## LDA_04 -0.010 0.010
## global_subjectivity -0.057 -0.036
## global_sentiment_polarity -0.065 -0.068
## global_rate_positive_words -0.006 -0.037
## global_rate_negative_words 0.029 0.024
## rate_positive_words -0.009 -0.015
## rate_negative_words 0.020 0.033
## avg_positive_polarity -0.051 -0.023
## min_positive_polarity -0.048 0.015
## max_positive_polarity -0.047 -0.023
## avg_negative_polarity -0.013 0.005
## min_negative_polarity -0.027 -0.002
## max_negative_polarity -0.007 -0.001
## title_subjectivity 0.004 -0.057
## title_sentiment_polarity -0.023 -0.038
## abs_title_subjectivity 0.001 0.024
## abs_title_sentiment_polarity 0.010 -0.019
## shares 0.028 -0.025
## weekday_is_thursday weekday_is_friday
## n_tokens_title -0.001 -0.046
## n_tokens_content -0.036 0.023
## n_unique_tokens 0.012 -0.007
## n_non_stop_words -0.035 0.027
## n_non_stop_unique_tokens 0.007 0.019
## num_hrefs -0.078 0.012
## num_self_hrefs -0.027 -0.006
## num_imgs -0.058 -0.038
## num_videos -0.003 -0.022
## average_token_length -0.019 0.024
## num_keywords -0.038 0.005
## kw_min_min 0.005 0.077
## kw_max_min 0.014 -0.017
## kw_avg_min 0.026 0.000
## kw_min_max -0.029 -0.016
## kw_max_max -0.012 -0.066
## kw_avg_max -0.009 -0.090
## kw_min_avg -0.025 -0.059
## kw_max_avg 0.007 -0.047
## kw_avg_avg -0.011 -0.078
## self_reference_min_shares -0.012 -0.008
## self_reference_max_shares -0.022 -0.016
## self_reference_avg_sharess -0.018 -0.012
## weekday_is_monday -0.194 -0.178
## weekday_is_tuesday -0.198 -0.181
## weekday_is_wednesday -0.214 -0.196
## weekday_is_thursday 1.000 -0.189
## weekday_is_friday -0.189 1.000
## weekday_is_saturday -0.137 -0.125
## weekday_is_sunday -0.155 -0.142
## is_weekend -0.218 -0.200
## LDA_00 0.041 -0.048
## LDA_01 0.055 -0.036
## LDA_02 0.035 0.052
## LDA_03 -0.007 -0.057
## LDA_04 -0.060 0.073
## global_subjectivity -0.020 -0.040
## global_sentiment_polarity -0.020 -0.012
## global_rate_positive_words -0.026 -0.011
## global_rate_negative_words 0.021 -0.039
## rate_positive_words -0.046 0.040
## rate_negative_words 0.023 -0.023
## avg_positive_polarity -0.037 -0.030
## min_positive_polarity 0.015 0.005
## max_positive_polarity -0.048 0.008
## avg_negative_polarity 0.051 -0.002
## min_negative_polarity 0.027 0.017
## max_negative_polarity 0.035 -0.006
## title_subjectivity -0.002 -0.080
## title_sentiment_polarity -0.028 -0.017
## abs_title_subjectivity 0.011 0.055
## abs_title_sentiment_polarity 0.002 -0.076
## shares -0.003 -0.027
## weekday_is_saturday weekday_is_sunday is_weekend
## n_tokens_title 0.004 -0.010 -0.005
## n_tokens_content 0.013 0.027 0.031
## n_unique_tokens -0.013 -0.050 -0.048
## n_non_stop_words 0.009 -0.005 0.003
## n_non_stop_unique_tokens -0.034 -0.073 -0.082
## num_hrefs 0.098 0.114 0.158
## num_self_hrefs 0.026 0.012 0.028
## num_imgs 0.132 0.154 0.214
## num_videos 0.014 0.007 0.015
## average_token_length 0.000 -0.026 -0.020
## num_keywords 0.044 0.085 0.098
## kw_min_min -0.077 -0.045 -0.090
## kw_max_min -0.016 -0.010 -0.019
## kw_avg_min -0.031 -0.025 -0.042
## kw_min_max 0.076 0.077 0.114
## kw_max_max 0.078 0.080 0.118
## kw_avg_max 0.114 0.058 0.126
## kw_min_avg 0.079 0.145 0.169
## kw_max_avg 0.041 0.022 0.046
## kw_avg_avg 0.105 0.107 0.158
## self_reference_min_shares -0.015 0.018 0.004
## self_reference_max_shares -0.007 0.037 0.024
## self_reference_avg_sharess -0.013 0.026 0.011
## weekday_is_monday -0.128 -0.145 -0.205
## weekday_is_tuesday -0.131 -0.148 -0.208
## weekday_is_wednesday -0.142 -0.160 -0.226
## weekday_is_thursday -0.137 -0.155 -0.218
## weekday_is_friday -0.125 -0.142 -0.200
## weekday_is_saturday 1.000 -0.103 0.627
## weekday_is_sunday -0.103 1.000 0.710
## is_weekend 0.627 0.710 1.000
## LDA_00 -0.024 -0.082 -0.081
## LDA_01 0.024 -0.041 -0.015
## LDA_02 -0.052 -0.085 -0.103
## LDA_03 0.153 0.177 0.247
## LDA_04 -0.074 -0.008 -0.059
## global_subjectivity 0.084 0.077 0.120
## global_sentiment_polarity 0.044 0.098 0.108
## global_rate_positive_words 0.039 0.064 0.077
## global_rate_negative_words 0.004 0.009 0.010
## rate_positive_words 0.020 0.015 0.026
## rate_negative_words -0.015 -0.021 -0.027
## avg_positive_polarity 0.052 0.123 0.133
## min_positive_polarity -0.014 0.037 0.019
## max_positive_polarity 0.044 0.098 0.108
## avg_negative_polarity -0.039 -0.008 -0.034
## min_negative_polarity -0.007 -0.024 -0.024
## max_negative_polarity -0.020 0.001 -0.013
## title_subjectivity 0.091 0.075 0.124
## title_sentiment_polarity 0.059 0.070 0.097
## abs_title_subjectivity -0.026 -0.048 -0.056
## abs_title_sentiment_polarity 0.049 0.053 0.076
## shares -0.009 -0.005 -0.010
## LDA_00 LDA_01 LDA_02 LDA_03 LDA_04
## n_tokens_title 0.009 0.054 0.039 0.010 -0.046
## n_tokens_content 0.070 -0.025 0.015 0.040 -0.085
## n_unique_tokens -0.055 0.036 -0.024 -0.061 0.086
## n_non_stop_words 0.004 0.029 0.003 -0.110 0.063
## n_non_stop_unique_tokens 0.000 0.040 0.000 -0.189 0.118
## num_hrefs 0.051 -0.058 -0.138 0.281 -0.169
## num_self_hrefs -0.005 0.090 -0.111 -0.037 0.041
## num_imgs -0.062 -0.045 -0.117 0.427 -0.185
## num_videos 0.010 0.012 0.015 0.108 -0.092
## average_token_length -0.013 0.045 0.054 -0.081 0.033
## num_keywords -0.032 -0.053 -0.105 0.147 -0.019
## kw_min_min -0.145 0.034 -0.070 -0.146 0.239
## kw_max_min 0.037 -0.003 -0.004 0.044 -0.060
## kw_avg_min 0.031 0.013 -0.003 0.016 -0.040
## kw_min_max -0.019 -0.086 -0.067 0.197 -0.068
## kw_max_max 0.155 -0.033 0.047 0.146 -0.239
## kw_avg_max 0.088 -0.076 0.085 0.308 -0.294
## kw_min_avg -0.015 -0.114 -0.063 0.269 -0.113
## kw_max_avg 0.042 -0.018 -0.056 0.182 -0.135
## kw_avg_avg 0.051 -0.075 -0.094 0.408 -0.267
## self_reference_min_shares -0.014 -0.005 0.013 0.073 -0.042
## self_reference_max_shares 0.035 0.003 -0.024 0.019 -0.035
## self_reference_avg_sharess 0.019 0.001 -0.012 0.044 -0.043
## weekday_is_monday 0.035 -0.016 -0.038 -0.094 0.054
## weekday_is_tuesday 0.055 0.018 0.004 -0.064 -0.010
## weekday_is_wednesday 0.002 -0.008 0.054 -0.042 0.010
## weekday_is_thursday 0.041 0.055 0.035 -0.007 -0.060
## weekday_is_friday -0.048 -0.036 0.052 -0.057 0.073
## weekday_is_saturday -0.024 0.024 -0.052 0.153 -0.074
## weekday_is_sunday -0.082 -0.041 -0.085 0.177 -0.008
## is_weekend -0.081 -0.015 -0.103 0.247 -0.059
## LDA_00 1.000 -0.095 -0.100 -0.140 -0.687
## LDA_01 -0.095 1.000 -0.073 -0.115 -0.137
## LDA_02 -0.100 -0.073 1.000 -0.179 -0.130
## LDA_03 -0.140 -0.115 -0.179 1.000 -0.469
## LDA_04 -0.687 -0.137 -0.130 -0.469 1.000
## global_subjectivity 0.016 -0.025 -0.124 0.158 -0.070
## global_sentiment_polarity 0.054 -0.073 -0.116 0.081 -0.036
## global_rate_positive_words 0.074 -0.033 -0.087 0.043 -0.051
## global_rate_negative_words -0.002 0.070 0.005 0.074 -0.074
## rate_positive_words 0.048 -0.045 -0.027 -0.106 0.057
## rate_negative_words -0.052 0.077 0.034 0.028 -0.012
## avg_positive_polarity 0.045 -0.030 -0.109 0.147 -0.090
## min_positive_polarity -0.080 0.034 -0.015 0.070 0.014
## max_positive_polarity 0.087 -0.030 -0.046 0.090 -0.110
## avg_negative_polarity -0.003 -0.010 -0.004 -0.105 0.079
## min_negative_polarity -0.037 0.005 -0.042 -0.055 0.083
## max_negative_polarity 0.031 0.001 0.043 -0.079 0.013
## title_subjectivity 0.029 -0.027 -0.068 0.139 -0.087
## title_sentiment_polarity 0.050 -0.059 -0.023 0.093 -0.079
## abs_title_subjectivity -0.013 -0.005 0.012 0.018 -0.004
## abs_title_sentiment_polarity 0.046 -0.056 -0.058 0.133 -0.092
## shares 0.032 -0.016 -0.036 0.064 -0.053
## global_subjectivity global_sentiment_polarity
## n_tokens_title -0.083 -0.119
## n_tokens_content 0.097 0.068
## n_unique_tokens 0.225 0.035
## n_non_stop_words 0.530 0.188
## n_non_stop_unique_tokens 0.243 0.015
## num_hrefs 0.272 0.206
## num_self_hrefs 0.040 0.084
## num_imgs 0.212 0.175
## num_videos 0.016 0.004
## average_token_length 0.425 0.140
## num_keywords 0.070 0.123
## kw_min_min -0.052 0.020
## kw_max_min -0.054 -0.025
## kw_avg_min -0.064 -0.024
## kw_min_max 0.040 -0.016
## kw_max_max 0.053 -0.022
## kw_avg_max 0.028 -0.090
## kw_min_avg 0.109 0.021
## kw_max_avg 0.024 0.002
## kw_avg_avg 0.092 0.009
## self_reference_min_shares 0.067 -0.049
## self_reference_max_shares 0.061 -0.019
## self_reference_avg_sharess 0.073 -0.036
## weekday_is_monday 0.027 0.054
## weekday_is_tuesday -0.057 -0.065
## weekday_is_wednesday -0.036 -0.068
## weekday_is_thursday -0.020 -0.020
## weekday_is_friday -0.040 -0.012
## weekday_is_saturday 0.084 0.044
## weekday_is_sunday 0.077 0.098
## is_weekend 0.120 0.108
## LDA_00 0.016 0.054
## LDA_01 -0.025 -0.073
## LDA_02 -0.124 -0.116
## LDA_03 0.158 0.081
## LDA_04 -0.070 -0.036
## global_subjectivity 1.000 0.396
## global_sentiment_polarity 0.396 1.000
## global_rate_positive_words 0.377 0.581
## global_rate_negative_words 0.170 -0.436
## rate_positive_words 0.340 0.709
## rate_negative_words 0.060 -0.659
## avg_positive_polarity 0.608 0.558
## min_positive_polarity 0.202 0.063
## max_positive_polarity 0.418 0.453
## avg_negative_polarity -0.349 0.203
## min_negative_polarity -0.300 0.230
## max_negative_polarity -0.090 -0.051
## title_subjectivity 0.143 0.075
## title_sentiment_polarity 0.055 0.210
## abs_title_subjectivity -0.040 -0.079
## abs_title_sentiment_polarity 0.089 0.111
## shares 0.017 -0.021
## global_rate_positive_words
## n_tokens_title -0.067
## n_tokens_content 0.140
## n_unique_tokens 0.067
## n_non_stop_words 0.315
## n_non_stop_unique_tokens 0.130
## num_hrefs 0.142
## num_self_hrefs 0.088
## num_imgs 0.073
## num_videos -0.016
## average_token_length 0.238
## num_keywords 0.068
## kw_min_min -0.007
## kw_max_min -0.011
## kw_avg_min -0.008
## kw_min_max 0.006
## kw_max_max -0.011
## kw_avg_max -0.041
## kw_min_avg -0.004
## kw_max_avg 0.032
## kw_avg_avg 0.039
## self_reference_min_shares -0.038
## self_reference_max_shares 0.008
## self_reference_avg_sharess -0.013
## weekday_is_monday -0.002
## weekday_is_tuesday -0.006
## weekday_is_wednesday -0.037
## weekday_is_thursday -0.026
## weekday_is_friday -0.011
## weekday_is_saturday 0.039
## weekday_is_sunday 0.064
## is_weekend 0.077
## LDA_00 0.074
## LDA_01 -0.033
## LDA_02 -0.087
## LDA_03 0.043
## LDA_04 -0.051
## global_subjectivity 0.377
## global_sentiment_polarity 0.581
## global_rate_positive_words 1.000
## global_rate_negative_words 0.071
## rate_positive_words 0.570
## rate_negative_words -0.389
## avg_positive_polarity 0.271
## min_positive_polarity -0.161
## max_positive_polarity 0.441
## avg_negative_polarity -0.066
## min_negative_polarity -0.126
## max_negative_polarity -0.017
## title_subjectivity 0.106
## title_sentiment_polarity 0.123
## abs_title_subjectivity -0.135
## abs_title_sentiment_polarity 0.098
## shares 0.004
## global_rate_negative_words rate_positive_words
## n_tokens_title 0.030 -0.087
## n_tokens_content 0.056 0.091
## n_unique_tokens 0.097 0.247
## n_non_stop_words 0.206 0.539
## n_non_stop_unique_tokens 0.167 0.289
## num_hrefs -0.030 0.145
## num_self_hrefs -0.061 0.123
## num_imgs -0.030 0.034
## num_videos 0.003 -0.002
## average_token_length 0.126 0.494
## num_keywords -0.004 0.022
## kw_min_min -0.050 0.042
## kw_max_min -0.026 -0.010
## kw_avg_min -0.032 0.009
## kw_min_max 0.035 -0.037
## kw_max_max 0.045 -0.040
## kw_avg_max 0.092 -0.139
## kw_min_avg 0.050 -0.050
## kw_max_avg 0.012 -0.016
## kw_avg_avg 0.063 -0.068
## self_reference_min_shares 0.072 -0.045
## self_reference_max_shares 0.047 -0.015
## self_reference_avg_sharess 0.065 -0.032
## weekday_is_monday -0.049 0.006
## weekday_is_tuesday 0.029 -0.009
## weekday_is_wednesday 0.024 -0.015
## weekday_is_thursday 0.021 -0.046
## weekday_is_friday -0.039 0.040
## weekday_is_saturday 0.004 0.020
## weekday_is_sunday 0.009 0.015
## is_weekend 0.010 0.026
## LDA_00 -0.002 0.048
## LDA_01 0.070 -0.045
## LDA_02 0.005 -0.027
## LDA_03 0.074 -0.106
## LDA_04 -0.074 0.057
## global_subjectivity 0.170 0.340
## global_sentiment_polarity -0.436 0.709
## global_rate_positive_words 0.071 0.570
## global_rate_negative_words 1.000 -0.546
## rate_positive_words -0.546 1.000
## rate_negative_words 0.807 -0.695
## avg_positive_polarity 0.142 0.305
## min_positive_polarity 0.044 -0.022
## max_positive_polarity 0.140 0.356
## avg_negative_polarity -0.189 -0.006
## min_negative_polarity -0.416 0.132
## max_negative_polarity 0.219 -0.209
## title_subjectivity 0.041 -0.029
## title_sentiment_polarity -0.106 0.113
## abs_title_subjectivity -0.032 -0.049
## abs_title_sentiment_polarity -0.004 -0.013
## shares 0.025 -0.032
## rate_negative_words avg_positive_polarity
## n_tokens_title 0.059 -0.108
## n_tokens_content -0.002 0.086
## n_unique_tokens 0.162 0.232
## n_non_stop_words 0.231 0.486
## n_non_stop_unique_tokens 0.202 0.253
## num_hrefs -0.057 0.246
## num_self_hrefs -0.059 0.026
## num_imgs -0.048 0.194
## num_videos -0.002 0.027
## average_token_length 0.182 0.370
## num_keywords -0.047 0.115
## kw_min_min -0.044 -0.053
## kw_max_min -0.028 -0.041
## kw_avg_min -0.033 -0.047
## kw_min_max 0.030 0.000
## kw_max_max 0.044 0.051
## kw_avg_max 0.091 -0.004
## kw_min_avg 0.057 0.088
## kw_max_avg -0.006 0.020
## kw_avg_avg 0.031 0.068
## self_reference_min_shares 0.090 0.033
## self_reference_max_shares 0.043 0.033
## self_reference_avg_sharess 0.070 0.037
## weekday_is_monday -0.028 0.000
## weekday_is_tuesday 0.020 -0.051
## weekday_is_wednesday 0.033 -0.023
## weekday_is_thursday 0.023 -0.037
## weekday_is_friday -0.023 -0.030
## weekday_is_saturday -0.015 0.052
## weekday_is_sunday -0.021 0.123
## is_weekend -0.027 0.133
## LDA_00 -0.052 0.045
## LDA_01 0.077 -0.030
## LDA_02 0.034 -0.109
## LDA_03 0.028 0.147
## LDA_04 -0.012 -0.090
## global_subjectivity 0.060 0.608
## global_sentiment_polarity -0.659 0.558
## global_rate_positive_words -0.389 0.271
## global_rate_negative_words 0.807 0.142
## rate_positive_words -0.695 0.305
## rate_negative_words 1.000 0.063
## avg_positive_polarity 0.063 1.000
## min_positive_polarity 0.160 0.381
## max_positive_polarity -0.045 0.629
## avg_negative_polarity -0.219 -0.146
## min_negative_polarity -0.345 -0.141
## max_negative_polarity 0.133 -0.046
## title_subjectivity -0.040 0.070
## title_sentiment_polarity -0.166 0.091
## abs_title_subjectivity 0.053 -0.015
## abs_title_sentiment_polarity -0.075 0.093
## shares 0.017 0.003
## min_positive_polarity max_positive_polarity
## n_tokens_title -0.038 -0.046
## n_tokens_content -0.273 0.347
## n_unique_tokens 0.392 -0.098
## n_non_stop_words 0.158 0.429
## n_non_stop_unique_tokens 0.287 0.021
## num_hrefs -0.092 0.313
## num_self_hrefs -0.084 0.109
## num_imgs -0.016 0.184
## num_videos 0.029 0.041
## average_token_length 0.151 0.297
## num_keywords -0.049 0.135
## kw_min_min 0.022 -0.132
## kw_max_min -0.022 0.000
## kw_avg_min -0.014 -0.018
## kw_min_max 0.006 0.039
## kw_max_max -0.045 0.136
## kw_avg_max -0.011 0.056
## kw_min_avg 0.069 0.077
## kw_max_avg -0.005 0.074
## kw_avg_avg 0.013 0.121
## self_reference_min_shares 0.025 0.043
## self_reference_max_shares -0.024 0.067
## self_reference_avg_sharess -0.005 0.062
## weekday_is_monday -0.008 -0.002
## weekday_is_tuesday -0.048 -0.047
## weekday_is_wednesday 0.015 -0.023
## weekday_is_thursday 0.015 -0.048
## weekday_is_friday 0.005 0.008
## weekday_is_saturday -0.014 0.044
## weekday_is_sunday 0.037 0.098
## is_weekend 0.019 0.108
## LDA_00 -0.080 0.087
## LDA_01 0.034 -0.030
## LDA_02 -0.015 -0.046
## LDA_03 0.070 0.090
## LDA_04 0.014 -0.110
## global_subjectivity 0.202 0.418
## global_sentiment_polarity 0.063 0.453
## global_rate_positive_words -0.161 0.441
## global_rate_negative_words 0.044 0.140
## rate_positive_words -0.022 0.356
## rate_negative_words 0.160 -0.045
## avg_positive_polarity 0.381 0.629
## min_positive_polarity 1.000 -0.094
## max_positive_polarity -0.094 1.000
## avg_negative_polarity 0.018 -0.148
## min_negative_polarity 0.175 -0.323
## max_negative_polarity -0.134 0.111
## title_subjectivity -0.021 0.059
## title_sentiment_polarity 0.009 0.064
## abs_title_subjectivity 0.020 -0.039
## abs_title_sentiment_polarity -0.021 0.062
## shares -0.038 -0.001
## avg_negative_polarity min_negative_polarity
## n_tokens_title -0.028 -0.048
## n_tokens_content -0.098 -0.384
## n_unique_tokens -0.114 0.194
## n_non_stop_words -0.265 -0.226
## n_non_stop_unique_tokens -0.159 0.040
## num_hrefs -0.113 -0.197
## num_self_hrefs -0.030 -0.034
## num_imgs -0.098 -0.123
## num_videos -0.016 -0.049
## average_token_length -0.196 -0.112
## num_keywords 0.019 0.014
## kw_min_min 0.042 0.100
## kw_max_min 0.012 0.006
## kw_avg_min 0.021 0.024
## kw_min_max -0.015 -0.049
## kw_max_max -0.048 -0.111
## kw_avg_max -0.093 -0.135
## kw_min_avg -0.057 -0.051
## kw_max_avg -0.037 -0.040
## kw_avg_avg -0.090 -0.102
## self_reference_min_shares -0.071 -0.081
## self_reference_max_shares -0.066 -0.063
## self_reference_avg_sharess -0.077 -0.076
## weekday_is_monday -0.008 0.009
## weekday_is_tuesday -0.013 -0.027
## weekday_is_wednesday 0.005 -0.002
## weekday_is_thursday 0.051 0.027
## weekday_is_friday -0.002 0.017
## weekday_is_saturday -0.039 -0.007
## weekday_is_sunday -0.008 -0.024
## is_weekend -0.034 -0.024
## LDA_00 -0.003 -0.037
## LDA_01 -0.010 0.005
## LDA_02 -0.004 -0.042
## LDA_03 -0.105 -0.055
## LDA_04 0.079 0.083
## global_subjectivity -0.349 -0.300
## global_sentiment_polarity 0.203 0.230
## global_rate_positive_words -0.066 -0.126
## global_rate_negative_words -0.189 -0.416
## rate_positive_words -0.006 0.132
## rate_negative_words -0.219 -0.345
## avg_positive_polarity -0.146 -0.141
## min_positive_polarity 0.018 0.175
## max_positive_polarity -0.148 -0.323
## avg_negative_polarity 1.000 0.684
## min_negative_polarity 0.684 1.000
## max_negative_polarity 0.549 -0.016
## title_subjectivity -0.066 -0.057
## title_sentiment_polarity 0.089 0.080
## abs_title_subjectivity -0.015 -0.002
## abs_title_sentiment_polarity -0.042 -0.010
## shares -0.043 -0.058
## max_negative_polarity title_subjectivity
## n_tokens_title 0.005 0.017
## n_tokens_content 0.244 -0.005
## n_unique_tokens -0.333 -0.056
## n_non_stop_words -0.128 -0.086
## n_non_stop_unique_tokens -0.238 -0.089
## num_hrefs 0.049 0.063
## num_self_hrefs 0.029 -0.010
## num_imgs 0.023 0.119
## num_videos 0.024 -0.004
## average_token_length -0.157 -0.089
## num_keywords 0.036 0.004
## kw_min_min -0.031 -0.018
## kw_max_min -0.001 0.013
## kw_avg_min -0.009 0.003
## kw_min_max 0.027 0.031
## kw_max_max 0.024 0.009
## kw_avg_max -0.025 0.076
## kw_min_avg -0.031 0.034
## kw_max_avg -0.017 0.048
## kw_avg_avg -0.026 0.090
## self_reference_min_shares -0.006 0.032
## self_reference_max_shares 0.025 0.036
## self_reference_avg_sharess 0.014 0.039
## weekday_is_monday -0.009 0.004
## weekday_is_tuesday -0.007 0.004
## weekday_is_wednesday -0.001 -0.057
## weekday_is_thursday 0.035 -0.002
## weekday_is_friday -0.006 -0.080
## weekday_is_saturday -0.020 0.091
## weekday_is_sunday 0.001 0.075
## is_weekend -0.013 0.124
## LDA_00 0.031 0.029
## LDA_01 0.001 -0.027
## LDA_02 0.043 -0.068
## LDA_03 -0.079 0.139
## LDA_04 0.013 -0.087
## global_subjectivity -0.090 0.143
## global_sentiment_polarity -0.051 0.075
## global_rate_positive_words -0.017 0.106
## global_rate_negative_words 0.219 0.041
## rate_positive_words -0.209 -0.029
## rate_negative_words 0.133 -0.040
## avg_positive_polarity -0.046 0.070
## min_positive_polarity -0.134 -0.021
## max_positive_polarity 0.111 0.059
## avg_negative_polarity 0.549 -0.066
## min_negative_polarity -0.016 -0.057
## max_negative_polarity 1.000 -0.037
## title_subjectivity -0.037 1.000
## title_sentiment_polarity -0.008 0.329
## abs_title_subjectivity -0.007 -0.477
## abs_title_sentiment_polarity -0.041 0.694
## shares 0.013 -0.005
## title_sentiment_polarity abs_title_subjectivity
## n_tokens_title -0.019 -0.103
## n_tokens_content 0.008 -0.025
## n_unique_tokens -0.052 0.023
## n_non_stop_words -0.041 -0.004
## n_non_stop_unique_tokens -0.091 0.025
## num_hrefs 0.052 0.005
## num_self_hrefs -0.030 -0.028
## num_imgs 0.098 -0.022
## num_videos 0.001 -0.026
## average_token_length -0.036 0.030
## num_keywords 0.071 0.012
## kw_min_min -0.032 -0.035
## kw_max_min 0.068 -0.008
## kw_avg_min 0.057 -0.014
## kw_min_max 0.032 0.003
## kw_max_max 0.016 0.035
## kw_avg_max 0.013 -0.007
## kw_min_avg 0.025 0.024
## kw_max_avg 0.055 0.012
## kw_avg_avg 0.080 0.011
## self_reference_min_shares -0.025 -0.030
## self_reference_max_shares -0.087 -0.037
## self_reference_avg_sharess -0.072 -0.044
## weekday_is_monday 0.004 -0.031
## weekday_is_tuesday -0.023 0.001
## weekday_is_wednesday -0.038 0.024
## weekday_is_thursday -0.028 0.011
## weekday_is_friday -0.017 0.055
## weekday_is_saturday 0.059 -0.026
## weekday_is_sunday 0.070 -0.048
## is_weekend 0.097 -0.056
## LDA_00 0.050 -0.013
## LDA_01 -0.059 -0.005
## LDA_02 -0.023 0.012
## LDA_03 0.093 0.018
## LDA_04 -0.079 -0.004
## global_subjectivity 0.055 -0.040
## global_sentiment_polarity 0.210 -0.079
## global_rate_positive_words 0.123 -0.135
## global_rate_negative_words -0.106 -0.032
## rate_positive_words 0.113 -0.049
## rate_negative_words -0.166 0.053
## avg_positive_polarity 0.091 -0.015
## min_positive_polarity 0.009 0.020
## max_positive_polarity 0.064 -0.039
## avg_negative_polarity 0.089 -0.015
## min_negative_polarity 0.080 -0.002
## max_negative_polarity -0.008 -0.007
## title_subjectivity 0.329 -0.477
## title_sentiment_polarity 1.000 -0.298
## abs_title_subjectivity -0.298 1.000
## abs_title_sentiment_polarity 0.564 -0.412
## shares -0.006 0.032
## abs_title_sentiment_polarity shares
## n_tokens_title 0.005 0.008
## n_tokens_content -0.008 0.097
## n_unique_tokens -0.062 -0.041
## n_non_stop_words -0.106 -0.024
## n_non_stop_unique_tokens -0.098 -0.017
## num_hrefs 0.052 0.058
## num_self_hrefs -0.022 -0.015
## num_imgs 0.091 0.044
## num_videos 0.022 0.085
## average_token_length -0.117 -0.020
## num_keywords 0.004 0.005
## kw_min_min -0.056 -0.040
## kw_max_min 0.062 0.013
## kw_avg_min 0.043 0.009
## kw_min_max 0.046 0.018
## kw_max_max 0.042 0.043
## kw_avg_max 0.103 0.036
## kw_min_avg 0.050 0.021
## kw_max_avg 0.065 0.044
## kw_avg_avg 0.113 0.089
## self_reference_min_shares 0.027 0.067
## self_reference_max_shares 0.032 0.016
## self_reference_avg_sharess 0.033 0.036
## weekday_is_monday 0.000 0.039
## weekday_is_tuesday 0.010 0.028
## weekday_is_wednesday -0.019 -0.025
## weekday_is_thursday 0.002 -0.003
## weekday_is_friday -0.076 -0.027
## weekday_is_saturday 0.049 -0.009
## weekday_is_sunday 0.053 -0.005
## is_weekend 0.076 -0.010
## LDA_00 0.046 0.032
## LDA_01 -0.056 -0.016
## LDA_02 -0.058 -0.036
## LDA_03 0.133 0.064
## LDA_04 -0.092 -0.053
## global_subjectivity 0.089 0.017
## global_sentiment_polarity 0.111 -0.021
## global_rate_positive_words 0.098 0.004
## global_rate_negative_words -0.004 0.025
## rate_positive_words -0.013 -0.032
## rate_negative_words -0.075 0.017
## avg_positive_polarity 0.093 0.003
## min_positive_polarity -0.021 -0.038
## max_positive_polarity 0.062 -0.001
## avg_negative_polarity -0.042 -0.043
## min_negative_polarity -0.010 -0.058
## max_negative_polarity -0.041 0.013
## title_subjectivity 0.694 -0.005
## title_sentiment_polarity 0.564 -0.006
## abs_title_subjectivity -0.412 0.032
## abs_title_sentiment_polarity 1.000 -0.003
## shares -0.003 1.000
# Attach the variable names to the correlation matrix so each row can be
# looked up by name.
# NOTE(review): `variables` is defined earlier in the report; it presumably
# supplies the `variable` column used as row names below - confirm.
correlations <- as_tibble(corr)
corr_mat <- bind_cols(variables, correlations)
correlation_matrix <- column_to_rownames(corr_mat, var = "variable")

# Correlation of every variable with the response (shares).
shares_corr <- correlation_matrix %>%
  select(shares)
shares_corr

# Keep the predictors whose absolute correlation with shares exceeds 0.025,
# strongest first. The names are moved into a column *before* filtering so
# the response can be excluded by name; comparing a floating-point
# correlation against 1 is fragile and would also drop any predictor that is
# perfectly correlated with shares.
shares_strongest_corr <- shares_corr %>%
  rownames_to_column(var = "predictor") %>%
  filter(predictor != "shares", abs(shares) > 0.025) %>%
  arrange(desc(abs(shares)))
shares_strongest_corr

# Predictor names used to generate scatter plots for the strongest
# correlations with shares.
listPred <- as.list(shares_strongest_corr$predictor)
Our idea is, in part, that what makes a link shareable is how easy its content is to consume. People want to be spoon-fed information. We will test this via proxies. We will measure shares against average key words (kw_avg_avg), average length of words (average_token_length), number of words in the content (n_tokens_content), and number of words in the title (n_tokens_title). The idea here is to measure both the quantity of words and the complexity of the content, i.e. an article with 500 “easy” words could be shared more than an article with 100 “difficult” words.
Now let’s clean our data. If we have any outliers, we will remove them first to get an idea of where the bulk of shares comes from. We will follow what the boxplot tells us when choosing what to remove.
# One IQR multiplier shared by the boxplot whiskers and the filter below, so
# the plots and the rows that get dropped follow the same rule. (Previously
# the boxplots used 2 while the filter used 1.5.)
outlier_coef <- 1.5

boxplot(
  subset_data$shares,
  horizontal = TRUE,
  range = outlier_coef,
  main = "Boxplot of shares with outliers"
)
boxplot(
  subset_data$shares,
  horizontal = TRUE,
  range = outlier_coef,
  outline = FALSE,
  main = "Boxplot of shares without outliers"
)

# We can have some pretty extreme values, so keep only the rows whose shares
# lie within outlier_coef * IQR of the first and third quartiles.
shares_quartiles <- quantile(subset_data$shares, probs = c(0.25, 0.75))
# NOTE: this variable shares its name with stats::IQR(); the name is kept in
# case later chunks refer to it.
IQR <- shares_quartiles[2] - shares_quartiles[1]
upper_limit <- shares_quartiles[2] + (outlier_coef * IQR)
lower_limit <- shares_quartiles[1] - (outlier_coef * IQR)
subset_data_wo_outliers <- subset_data %>%
  filter(shares <= upper_limit & shares >= lower_limit)
After removing any potential outliers from our data, we can compare shares against our key metrics.
# Correlation between shares and kw_avg_avg on the outlier-free data.
correlation1 <- cor(
  subset_data_wo_outliers$shares,
  subset_data_wo_outliers$kw_avg_avg
)

# Scatter plot with a smoother, labelled with the correlation.
plot1 <- ggplot(subset_data_wo_outliers, aes(y = shares, x = kw_avg_avg)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Number of shares vs. Average number of key words",
    y = "# of shares",
    x = "Average # of key words"
  ) +
  # annotate() draws the label once; geom_text() with a constant label would
  # redraw it for every row of the data and overplot itself.
  annotate(
    "text",
    color = "red",
    x = 15000,
    y = 5000,
    label = paste0("Correlation = ", round(correlation1, 3))
  )
plot1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
We can measure the trend of shares as a function of the average number of key words. If we see a positive trend, we can say that the more key words an article has, the more likely it is to be shared; a negative trend suggests the opposite. We also report the correlation to get a more precise gauge in case the graph is not clear enough.
# Correlation between shares and average_token_length on the outlier-free
# data.
correlation2 <- cor(
  subset_data_wo_outliers$shares,
  subset_data_wo_outliers$average_token_length
)

# 2D density contours, labelled with the correlation.
plot2 <- ggplot(
  subset_data_wo_outliers,
  aes(y = shares, x = average_token_length)
) +
  geom_density_2d() +
  labs(
    title = "number of shares vs. Average length of words in content",
    y = "# of shares",
    x = "Average length of words in content"
  ) +
  # annotate() draws the label once; geom_text() with a constant label would
  # redraw it for every row of the data and overplot itself.
  annotate(
    "text",
    color = "red",
    x = 5,
    y = 3500,
    label = paste0("Correlation = ", round(correlation2, 3))
  )
plot2
With a density plot as a function of average length of words in content we see where most of our shares come from. We can utilize this to help explain our model down below.
# Correlation between shares and n_tokens_content on the outlier-free data.
correlation3 <- cor(
  subset_data_wo_outliers$shares,
  subset_data_wo_outliers$n_tokens_content
)

# Rug marks along both axes, labelled with the correlation.
plot3 <- ggplot(
  subset_data_wo_outliers,
  aes(y = shares, x = n_tokens_content)
) +
  geom_rug() +
  labs(
    title = "number of shares vs. number of words in content",
    y = "# of shares",
    x = "# of words in content"
  ) +
  # annotate() draws the label once; geom_text() with a constant label would
  # redraw it for every row of the data and overplot itself.
  annotate(
    "text",
    color = "red",
    x = 4000,
    y = 4000,
    label = paste0("Correlation = ", round(correlation3, 3))
  )
plot3
Using a rug plot, we can examine the relationship between the number of words in the content and the number of shares. The region where both rugs are highly concentrated is where we can gauge the correlation. If both rugs are concentrated near zero, then we see that the fewer words an article has, the more shareable it is, or vice versa.
# Correlation between shares and n_tokens_title on the outlier-free data.
correlation4 <- cor(
  subset_data_wo_outliers$shares,
  subset_data_wo_outliers$n_tokens_title
)

# Column chart of shares by title length, labelled with the correlation.
plot4 <- ggplot(
  subset_data_wo_outliers,
  aes(y = shares, x = n_tokens_title)
) +
  geom_col() +
  labs(
    title = "number of shares vs. number of words in title",
    y = "# of shares",
    x = "# of words in title"
  ) +
  # annotate() draws the label once; geom_text() with a constant label would
  # redraw it for every row of the data and overplot itself.
  annotate(
    "text",
    color = "red",
    x = 15,
    y = 600000,
    label = paste0("Correlation = ", round(correlation4, 3))
  )
plot4
Here, the correlation matrix is subset to the correlations between shares and each predictor and filtered at a threshold correlation value of 0.025; the result is used to generate a plot of shares against each predictor that meets the threshold.
# Scatter plot of shares against one predictor in the training data.
#
# x: name of a column of training_data, given as a string.
# Returns whatever plot() returns; the function is called for the plot it
# draws.
corP <- function(x) {
  # `[[` looks the column up by its string name.
  var1 <- training_data[[x]]
  # Label each plot with the predictor's name so the plots generated below
  # can be told apart.
  plot(
    var1,
    training_data$shares,
    xlab = x,
    ylab = "shares",
    main = paste0("shares vs. ", x)
  )
}
# One plot per predictor that met the correlation threshold.
lapply(listPred, corP)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## NULL
##
## [[15]]
## NULL
##
## [[16]]
## NULL
##
## [[17]]
## NULL
##
## [[18]]
## NULL
##
## [[19]]
## NULL
##
## [[20]]
## NULL
##
## [[21]]
## NULL
##
## [[22]]
## NULL
##
## [[23]]
## NULL
##
## [[24]]
## NULL
##
## [[25]]
## NULL
# Scatter plot of shares against one predictor, using the data with outliers
# removed.
#
# x: name of a column of subset_data_wo_outliers, given as a string.
# Returns whatever plot() returns; the function is called for the plot it
# draws.
corNoOut <- function(x) {
  # `[[` looks the column up by its string name.
  var1 <- subset_data_wo_outliers[[x]]
  # Label each plot with the predictor's name so the plots generated below
  # can be told apart.
  plot(
    var1,
    subset_data_wo_outliers$shares,
    xlab = x,
    ylab = "shares",
    main = paste0("shares vs. ", x, " (outliers removed)")
  )
}
# One plot per predictor that met the correlation threshold.
lapply(listPred, corNoOut)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## NULL
##
## [[15]]
## NULL
##
## [[16]]
## NULL
##
## [[17]]
## NULL
##
## [[18]]
## NULL
##
## [[19]]
## NULL
##
## [[20]]
## NULL
##
## [[21]]
## NULL
##
## [[22]]
## NULL
##
## [[23]]
## NULL
##
## [[24]]
## NULL
##
## [[25]]
## NULL